summary of the dataframe mygenes
# Per-column overview: counts of the most frequent values for the categorical
# columns, and min/quartiles/mean/max for the numeric start and end columns.
summary(mygenes)
## transcript_type feature chr
## protein_coding :2001054 exon :1306656 1 : 238010
## nonsense_mediated_decay: 293471 CDS : 791856 2 : 189916
## processed_transcript : 173401 UTR : 304070 17 : 166529
## retained_intron : 150034 transcript : 215170 19 : 163304
## lincRNA : 55928 stop_codon : 73411 3 : 159475
## antisense : 45811 start_codon: 73358 11 : 157597
## (Other) : 108613 (Other) : 63791 (Other):1753481
## start end
## Min. : 577 Min. : 647
## 1st Qu.: 31698816 1st Qu.: 31700419
## Median : 56565463 Median : 56566763
## Mean : 73148763 Mean : 73152067
## 3rd Qu.:108204790 3rd Qu.:108206944
## Max. :249230780 Max. :249231242
##
# Turn the printed summary table into an ordinary data frame so it can be
# inspected or exported like any other table.
genes_summary <- data.frame(
  unclass(summary(mygenes)),  # strip the "table" class before converting
  check.names = FALSE,        # keep the column headers exactly as summary() wrote them
  stringsAsFactors = FALSE,
  row.names = NULL
)
genes_summary
subsetting data
# Columns 2 and 4 are `feature` and `start`; show their first 50 rows.
print(head(mygenes[, c(2, 4)], n = 50))
## feature start
## 1 gene 11869
## 2 transcript 11869
## 3 exon 11869
## 4 exon 12613
## 5 exon 13221
## 6 transcript 11872
## 7 exon 11872
## 8 exon 12613
## 9 exon 13225
## 10 transcript 11874
## 11 exon 11874
## 12 exon 12595
## 13 exon 13403
## 14 exon 13661
## 15 transcript 12010
## 16 exon 12010
## 17 exon 12179
## 18 exon 12613
## 19 exon 12975
## 20 exon 13221
## 21 exon 13453
## 22 gene 14363
## 23 transcript 14363
## 24 exon 29321
## 25 exon 24738
## 26 exon 18268
## 27 exon 17915
## 28 exon 17602
## 29 exon 17233
## 30 exon 16854
## 31 exon 16607
## 32 exon 15904
## 33 exon 15796
## 34 exon 14970
## 35 exon 14363
## 36 transcript 14363
## 37 exon 24734
## 38 exon 18268
## 39 exon 17915
## 40 exon 17606
## 41 exon 17498
## 42 exon 17233
## 43 exon 16854
## 44 exon 14970
## 45 exon 14363
## 46 transcript 14363
## 47 exon 29321
## 48 exon 24738
## 49 exon 17915
## 50 exon 17606
# Columns 2 through 4 are `feature`, `chr` and `start`; show their first 50 rows.
print(head(mygenes[, 2:4], n = 50))
## feature chr start
## 1 gene 1 11869
## 2 transcript 1 11869
## 3 exon 1 11869
## 4 exon 1 12613
## 5 exon 1 13221
## 6 transcript 1 11872
## 7 exon 1 11872
## 8 exon 1 12613
## 9 exon 1 13225
## 10 transcript 1 11874
## 11 exon 1 11874
## 12 exon 1 12595
## 13 exon 1 13403
## 14 exon 1 13661
## 15 transcript 1 12010
## 16 exon 1 12010
## 17 exon 1 12179
## 18 exon 1 12613
## 19 exon 1 12975
## 20 exon 1 13221
## 21 exon 1 13453
## 22 gene 1 14363
## 23 transcript 1 14363
## 24 exon 1 29321
## 25 exon 1 24738
## 26 exon 1 18268
## 27 exon 1 17915
## 28 exon 1 17602
## 29 exon 1 17233
## 30 exon 1 16854
## 31 exon 1 16607
## 32 exon 1 15904
## 33 exon 1 15796
## 34 exon 1 14970
## 35 exon 1 14363
## 36 transcript 1 14363
## 37 exon 1 24734
## 38 exon 1 18268
## 39 exon 1 17915
## 40 exon 1 17606
## 41 exon 1 17498
## 42 exon 1 17233
## 43 exon 1 16854
## 44 exon 1 14970
## 45 exon 1 14363
## 46 transcript 1 14363
## 47 exon 1 29321
## 48 exon 1 24738
## 49 exon 1 17915
## 50 exon 1 17606
# Keep only the rows on chromosome 1, then show the first five columns of the
# first 50 matching rows.
print(head(mygenes[mygenes$chr == "1", 1:5], n = 50))
## transcript_type feature chr start end
## 1 pseudogene gene 1 11869 14412
## 2 processed_transcript transcript 1 11869 14409
## 3 processed_transcript exon 1 11869 12227
## 4 processed_transcript exon 1 12613 12721
## 5 processed_transcript exon 1 13221 14409
## 6 transcribed_unprocessed_pseudogene transcript 1 11872 14412
## 7 transcribed_unprocessed_pseudogene exon 1 11872 12227
## 8 transcribed_unprocessed_pseudogene exon 1 12613 12721
## 9 transcribed_unprocessed_pseudogene exon 1 13225 14412
## 10 transcribed_unprocessed_pseudogene transcript 1 11874 14409
## 11 transcribed_unprocessed_pseudogene exon 1 11874 12227
## 12 transcribed_unprocessed_pseudogene exon 1 12595 12721
## 13 transcribed_unprocessed_pseudogene exon 1 13403 13655
## 14 transcribed_unprocessed_pseudogene exon 1 13661 14409
## 15 transcribed_unprocessed_pseudogene transcript 1 12010 13670
## 16 transcribed_unprocessed_pseudogene exon 1 12010 12057
## 17 transcribed_unprocessed_pseudogene exon 1 12179 12227
## 18 transcribed_unprocessed_pseudogene exon 1 12613 12697
## 19 transcribed_unprocessed_pseudogene exon 1 12975 13052
## 20 transcribed_unprocessed_pseudogene exon 1 13221 13374
## 21 transcribed_unprocessed_pseudogene exon 1 13453 13670
## 22 pseudogene gene 1 14363 29806
## 23 unprocessed_pseudogene transcript 1 14363 29370
## 24 unprocessed_pseudogene exon 1 29321 29370
## 25 unprocessed_pseudogene exon 1 24738 24891
## 26 unprocessed_pseudogene exon 1 18268 18379
## 27 unprocessed_pseudogene exon 1 17915 18061
## 28 unprocessed_pseudogene exon 1 17602 17742
## 29 unprocessed_pseudogene exon 1 17233 17364
## 30 unprocessed_pseudogene exon 1 16854 17055
## 31 unprocessed_pseudogene exon 1 16607 16765
## 32 unprocessed_pseudogene exon 1 15904 15947
## 33 unprocessed_pseudogene exon 1 15796 15901
## 34 unprocessed_pseudogene exon 1 14970 15038
## 35 unprocessed_pseudogene exon 1 14363 14829
## 36 unprocessed_pseudogene transcript 1 14363 24886
## 37 unprocessed_pseudogene exon 1 24734 24886
## 38 unprocessed_pseudogene exon 1 18268 18369
## 39 unprocessed_pseudogene exon 1 17915 18061
## 40 unprocessed_pseudogene exon 1 17606 17742
## 41 unprocessed_pseudogene exon 1 17498 17504
## 42 unprocessed_pseudogene exon 1 17233 17364
## 43 unprocessed_pseudogene exon 1 16854 17055
## 44 unprocessed_pseudogene exon 1 14970 15038
## 45 unprocessed_pseudogene exon 1 14363 14829
## 46 unprocessed_pseudogene transcript 1 14363 29370
## 47 unprocessed_pseudogene exon 1 29321 29370
## 48 unprocessed_pseudogene exon 1 24738 24891
## 49 unprocessed_pseudogene exon 1 17915 18061
## 50 unprocessed_pseudogene exon 1 17606 17742
listing the categories (levels) of categorical data
# List every distinct level of the `chr` column.
print(levels(mygenes[["chr"]]))
## [1] "1" "10"
## [3] "11" "12"
## [5] "13" "14"
## [7] "15" "16"
## [9] "17" "18"
## [11] "19" "2"
## [13] "20" "21"
## [15] "22" "3"
## [17] "4" "5"
## [19] "6" "7"
## [21] "8" "9"
## [23] "GL000191.1" "GL000192.1"
## [25] "GL000193.1" "GL000194.1"
## [27] "GL000195.1" "GL000196.1"
## [29] "GL000199.1" "GL000201.1"
## [31] "GL000204.1" "GL000205.1"
## [33] "GL000209.1" "GL000211.1"
## [35] "GL000212.1" "GL000213.1"
## [37] "GL000215.1" "GL000216.1"
## [39] "GL000218.1" "GL000219.1"
## [41] "GL000220.1" "GL000221.1"
## [43] "GL000222.1" "GL000223.1"
## [45] "GL000224.1" "GL000225.1"
## [47] "GL000228.1" "GL000229.1"
## [49] "GL000230.1" "GL000231.1"
## [51] "GL000233.1" "GL000236.1"
## [53] "GL000237.1" "GL000240.1"
## [55] "GL000241.1" "GL000242.1"
## [57] "GL000243.1" "GL000247.1"
## [59] "HG1007_PATCH" "HG1032_PATCH"
## [61] "HG104_HG975_PATCH" "HG1063_PATCH"
## [63] "HG1074_PATCH" "HG1079_PATCH"
## [65] "HG1082_HG167_PATCH" "HG1091_PATCH"
## [67] "HG1133_PATCH" "HG1146_PATCH"
## [69] "HG115_PATCH" "HG1208_PATCH"
## [71] "HG1211_PATCH" "HG122_PATCH"
## [73] "HG1257_PATCH" "HG1287_PATCH"
## [75] "HG1292_PATCH" "HG1293_PATCH"
## [77] "HG1304_PATCH" "HG1308_PATCH"
## [79] "HG1322_PATCH" "HG1350_HG959_PATCH"
## [81] "HG14_PATCH" "HG142_HG150_NOVEL_TEST"
## [83] "HG1423_PATCH" "HG1424_PATCH"
## [85] "HG1425_PATCH" "HG1426_PATCH"
## [87] "HG1433_PATCH" "HG1434_PATCH"
## [89] "HG1435_PATCH" "HG1436_HG1432_PATCH"
## [91] "HG1437_PATCH" "HG1438_PATCH"
## [93] "HG1439_PATCH" "HG144_PATCH"
## [95] "HG1440_PATCH" "HG1441_PATCH"
## [97] "HG1442_PATCH" "HG1443_HG1444_PATCH"
## [99] "HG1453_PATCH" "HG1458_PATCH"
## [101] "HG1459_PATCH" "HG1462_PATCH"
## [103] "HG1463_PATCH" "HG1472_PATCH"
## [105] "HG1479_PATCH" "HG1486_PATCH"
## [107] "HG1487_PATCH" "HG1488_PATCH"
## [109] "HG1490_PATCH" "HG1497_PATCH"
## [111] "HG1500_PATCH" "HG1501_PATCH"
## [113] "HG1502_PATCH" "HG151_NOVEL_TEST"
## [115] "HG1591_PATCH" "HG1592_PATCH"
## [117] "HG1595_PATCH" "HG1699_PATCH"
## [119] "HG174_HG254_PATCH" "HG183_PATCH"
## [121] "HG185_PATCH" "HG186_PATCH"
## [123] "HG19_PATCH" "HG193_PATCH"
## [125] "HG237_PATCH" "HG243_PATCH"
## [127] "HG256_PATCH" "HG27_PATCH"
## [129] "HG271_PATCH" "HG280_PATCH"
## [131] "HG281_PATCH" "HG29_PATCH"
## [133] "HG299_PATCH" "HG305_PATCH"
## [135] "HG306_PATCH" "HG311_PATCH"
## [137] "HG325_PATCH" "HG329_PATCH"
## [139] "HG339_PATCH" "HG344_PATCH"
## [141] "HG348_PATCH" "HG357_PATCH"
## [143] "HG375_PATCH" "HG385_PATCH"
## [145] "HG388_HG400_PATCH" "HG414_PATCH"
## [147] "HG417_PATCH" "HG418_PATCH"
## [149] "HG444_PATCH" "HG480_HG481_PATCH"
## [151] "HG497_PATCH" "HG50_PATCH"
## [153] "HG506_HG507_HG1000_PATCH" "HG531_PATCH"
## [155] "HG536_PATCH" "HG544_PATCH"
## [157] "HG686_PATCH" "HG7_PATCH"
## [159] "HG706_PATCH" "HG729_PATCH"
## [161] "HG730_PATCH" "HG736_PATCH"
## [163] "HG745_PATCH" "HG747_PATCH"
## [165] "HG748_PATCH" "HG75_PATCH"
## [167] "HG79_PATCH" "HG858_PATCH"
## [169] "HG865_PATCH" "HG871_PATCH"
## [171] "HG873_PATCH" "HG883_PATCH"
## [173] "HG905_PATCH" "HG944_PATCH"
## [175] "HG946_PATCH" "HG953_PATCH"
## [177] "HG957_PATCH" "HG962_PATCH"
## [179] "HG971_PATCH" "HG979_PATCH"
## [181] "HG987_PATCH" "HG989_PATCH"
## [183] "HG990_PATCH" "HG991_PATCH"
## [185] "HG996_PATCH" "HG998_1_PATCH"
## [187] "HG998_2_PATCH" "HG999_1_PATCH"
## [189] "HG999_2_PATCH" "HSCHR1_1_CTG31"
## [191] "HSCHR1_2_CTG31" "HSCHR1_3_CTG31"
## [193] "HSCHR10_1_CTG2" "HSCHR10_1_CTG5"
## [195] "HSCHR12_1_CTG1" "HSCHR12_1_CTG2_1"
## [197] "HSCHR12_1_CTG5" "HSCHR12_2_CTG2"
## [199] "HSCHR12_2_CTG2_1" "HSCHR12_3_CTG2_1"
## [201] "HSCHR15_1_CTG4" "HSCHR15_1_CTG8"
## [203] "HSCHR16_1_CTG3_1" "HSCHR16_2_CTG3_1"
## [205] "HSCHR17_1" "HSCHR17_1_CTG1"
## [207] "HSCHR17_1_CTG4" "HSCHR17_2_CTG4"
## [209] "HSCHR17_3_CTG4" "HSCHR17_4_CTG4"
## [211] "HSCHR17_5_CTG4" "HSCHR17_6_CTG4"
## [213] "HSCHR18_1_CTG1_1" "HSCHR18_1_CTG2_1"
## [215] "HSCHR18_2_CTG2" "HSCHR18_2_CTG2_1"
## [217] "HSCHR19_1_CTG3" "HSCHR19_1_CTG3_1"
## [219] "HSCHR19_2_CTG3" "HSCHR19_3_CTG3"
## [221] "HSCHR19LRC_COX1_CTG1" "HSCHR19LRC_COX2_CTG1"
## [223] "HSCHR19LRC_LRC_I_CTG1" "HSCHR19LRC_LRC_J_CTG1"
## [225] "HSCHR19LRC_LRC_S_CTG1" "HSCHR19LRC_LRC_T_CTG1"
## [227] "HSCHR19LRC_PGF1_CTG1" "HSCHR19LRC_PGF2_CTG1"
## [229] "HSCHR2_1_CTG1" "HSCHR2_1_CTG12"
## [231] "HSCHR2_2_CTG12" "HSCHR20_1_CTG1"
## [233] "HSCHR21_2_CTG1_1" "HSCHR21_3_CTG1_1"
## [235] "HSCHR21_4_CTG1_1" "HSCHR22_1_CTG1"
## [237] "HSCHR22_1_CTG2" "HSCHR22_2_CTG1"
## [239] "HSCHR3_1_CTG1" "HSCHR3_1_CTG2_1"
## [241] "HSCHR4_1" "HSCHR4_1_CTG12"
## [243] "HSCHR4_1_CTG6" "HSCHR4_2_CTG9"
## [245] "HSCHR5_1_CTG1" "HSCHR5_1_CTG2"
## [247] "HSCHR5_1_CTG5" "HSCHR5_2_CTG1"
## [249] "HSCHR5_3_CTG1" "HSCHR6_1_CTG5"
## [251] "HSCHR6_MHC_APD" "HSCHR6_MHC_COX"
## [253] "HSCHR6_MHC_DBB" "HSCHR6_MHC_MANN"
## [255] "HSCHR6_MHC_MCF" "HSCHR6_MHC_QBL"
## [257] "HSCHR6_MHC_SSTO" "HSCHR7_1_CTG6"
## [259] "HSCHR9_1_CTG1" "HSCHR9_1_CTG35"
## [261] "HSCHR9_2_CTG35" "HSCHR9_3_CTG35"
## [263] "MT" "X"
## [265] "Y"
creating a graph with a subset of data and then making interactive plots
#install.packages("ggplot2") The install calls have been commented out, but the '#' symbol
#install.packages("plotly") can be removed to install the packages if they are not already installed
library(ggplot2) # We load in libraries
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Character vector naming the 22 autosomes in numerical order
autosomes <- as.character(1:22)
# Keep only the rows located on an autosome. At this point the `chr` factor
# still carries the levels of the chromosomes that were filtered out.
genes <- mygenes[mygenes$chr %in% autosomes, ]
genes
# Rebuild `chr` with the autosome levels only, listed in numerical rather than
# alphabetical order
genes$chr <- factor(genes$chr, levels = autosomes)
# Stacked bar chart: number of rows per chromosome, coloured by feature
p <- ggplot(data = genes) +
  geom_bar(mapping = aes(x = chr, fill = feature), width = 1)
# Interactive version of the bar chart
ggplotly(p)
# The same bar chart drawn in polar coordinates
p + coord_polar()
